library(tidyverse)
library(anytime)
library(gganimate)
library(gifski)
library(ggthemes)
library(sf)
library(transformr)
library(ggrepel)
- This project is created for the BUS240 - Information Visualization
course, with the purpose of practicing and exploring the techniques of
data visualization using the R programming language.
- The dataset used for this project is the Uber-Lyft dataset from
Kaggle (https://www.kaggle.com/datasets/ravi72munde/uber-lyft-cab-prices).
- The project creates informative figures that present complex data in
a clear, concise, and easily understandable format.
- The project also explores two methods of creating animated figures:
- Utilizing the gganimate package to easily create visually appealing
animations.
- For geographic figures, the gganimate package didn’t work, so I
generated each frame of the animation separately and combined the
frames into a gif file. This way I managed to make an animated
geographic figure.
- This experience has greatly enhanced my skills in both R programming
and data visualization, which I believe will be beneficial in future
endeavors.
# Load the raw inputs as data frames: the cab rides (keeping only rides that
# have a recorded price) and the weather observations.
df_origin <- read.csv(
  "/Users/guangjitang/Downloads/uber data/cab_rides.csv"
)
df <- filter(df_origin, !is.na(price))
weather <- read.csv(
  "/Users/guangjitang/Downloads/uber data/weather.csv"
)
# NOTE(review): this .rda file is expected to provide the `map` object used
# by the code below -- confirm.
load("/Users/guangjitang/test1/map.rda")
# Get the middle spot of the polygon for each area.
# The middle is taken as the plain mean of the vertices of the first element
# of each geometry: column 1 goes to `long_mid`, column 2 to `lat_mid`.
# This is a vertex average, not an area-weighted centroid.
# NOTE(review): assumes the first element of every geometry is a coordinate
# matrix with longitude in column 1 and latitude in column 2 -- confirm.
mid_points <- vapply(
  map$geometry,
  function(geom) {
    ring <- geom[[1]]
    c(mean(ring[, 1]), mean(ring[, 2]))
  },
  numeric(2),
  USE.NAMES = FALSE
)
# `mid_points` has one column per area: row 1 = longitude, row 2 = latitude.
map$long_mid <- mid_points[1, ]
map$lat_mid <- mid_points[2, ]
# Give each map area the same name that `df$source` uses, for future joins.
# Every source name is used as a case-insensitive pattern against the area
# name; if several sources match one area, the last match wins.
u <- unique(df$source)
map$nname <- NA
# Number of (area, source) matches found by the loop below.
count_name <- 0
for (i in seq_along(map$Name)) {
  for (j in seq_along(u)) {
    if (grepl(u[j], map$Name[i], ignore.case = TRUE)) {
      map$nname[i] <- u[j]
      count_name <- count_name + 1
    }
  }
}
count_name
## [1] 11
# Only 11 names were added successfully; add the last one manually.
# NOTE(review): the hard-coded row index 6 depends on the row order of
# `map` -- confirm it still points at the Haymarket Square area.
map$nname[6] <- "Haymarket Square"
# map2 holds the places of interest: only the areas that received a source
# name, with their geometry and middle-point coordinates.
map2 <- select(
  filter(map, !is.na(nname)),
  nname, geometry, long_mid, lat_mid
)
# Copy of the rides for testing; append the commented-out sample_n() call to
# work on a 10000-row sample instead.
# NOTE(review): df_plot below is built from `df`, not from `df_sample`.
df_sample <- df # %>% sample_n(10000)

# Derive the location and time columns used for plotting and for the weather
# join. All columns are created in one mutate(); later arguments can use the
# columns created by earlier ones.
df_plot <- mutate(
  df,
  # Only the weather at the ride's source is considered.
  location = source,
  # The ride time stamp is divided by 1000 before conversion to a date-time.
  time = anytime(time_stamp / 1000),
  # First 10 characters of the printed time.
  # NOTE(review): with the default "YYYY-MM-DD HH:MM:SS" format this is the
  # calendar date, not the hour -- confirm this is the intended join key.
  time_hour = substr(time, 1, 10),
  # Characters 12-13 of the printed time, i.e. the hour of day as a number.
  hour = as.numeric(substr(time, 12, 13))
)
# Categorize each observation by the time of day and record how many hours
# that part of the day spans:
#   hour  0-5  and 22-23 -> "night"     (8 hours)
#   hour  6-10           -> "morning"   (5 hours)
#   hour 11-13           -> "noon"      (3 hours)
#   hour 14-18           -> "afternoon" (5 hours)
#   hour 19-21           -> "evening"   (3 hours)
# Conditions are checked from the latest threshold down, so the first match
# is the same label the sequential overwrites would have left in place.
# Anything that matches no condition (including a missing hour) is "night".
df_plot <- mutate(
  df_plot,
  part_of_time = case_when(
    hour > 21 ~ "night",
    hour > 18 ~ "evening",
    hour > 13 ~ "afternoon",
    hour > 10 ~ "noon",
    hour > 5 ~ "morning",
    TRUE ~ "night"
  ),
  part_duration = unname(
    c(night = 8, morning = 5, noon = 3, afternoon = 5, evening = 3)[
      part_of_time
    ]
  )
)
# Running row number, used later as a per-ride identifier.
df_plot$nid <- seq(1, length(df_plot$id))
# Add the same time columns to the weather data so it can be joined to the
# rides. The weather time stamp is converted as-is (no division by 1000).
weather <- mutate(
  weather,
  time = anytime(time_stamp),
  # First 10 characters of the printed time.
  # NOTE(review): with the default "YYYY-MM-DD HH:MM:SS" format this is the
  # calendar date, not the hour -- confirm.
  time_hour = substr(time, 1, 10)
)
# Attach weather observations to the rides. The join pairs every ride with
# all weather rows that share its `time_hour` and `location`, so a ride can
# appear several times at this point.
# (Base R alternative: merge(df_plot, weather, by = c("time_hour", "location")))
df_plot <- inner_join(df_plot, weather, by = c("time_hour", "location"))

# For each ride (`nid`), keep only the weather row closest in time.
# `g_time` is the absolute gap between ride time (time.x) and weather time
# (time.y). Rows are sorted by ascending gap within each ride, so the first
# row of each group is the closest weather observation. Selecting the row
# with the largest `g_time` instead would connect each ride to the farthest
# candidate. slice(1) also guarantees exactly one row per ride on ties.
df_plot <- df_plot %>%
  mutate(g_time = abs(time.x - time.y)) %>%
  group_by(nid) %>%
  arrange(g_time, .by_group = TRUE) %>%
  slice(1) %>%
  # Drop the per-ride grouping so later steps work on the whole table.
  ungroup() %>%
  select(-g_time)
# Flag each row as rainy when its `rain` value is not missing.
df_plot <- mutate(df_plot, rainy = !is.na(rain))

# Write the prepared data to disk and read it straight back, so that the
# saved file can be used as a starting point without redoing the steps above.
save(df_plot, file = "df_plot.rda")
load(file = "df_plot.rda")
# df_price is prepared for the analyses that consider the average price by
# distance (avg_price = price / distance). Cab rides whose distance is not
# above 0.5 mile are left out, since their avg_price would be extremely
# large and meaningless.
df_price <- mutate(
  filter(df_plot, distance > 0.5),
  avg_price = price / distance
)
# Price against distance, one panel per cab type, colored by the rainy flag,
# with a smoothed trend line per color group.
ggplot(df_plot, aes(distance, price, color = rainy)) +
  facet_wrap("cab_type") +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth() +
  theme_economist() +
  scale_color_manual(values = c("#ffcc61", "blue"))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Price and distance by hour (animated).
# Points are drawn from every 10th ride only (nid divisible by 10), while the
# smoothed trend uses the full data. One animation frame per value of `hour`;
# the y axis is zoomed to 0-50 without dropping data.
df_plot_sample <- filter(df_plot, nid %% 10 == 0)
# NOTE(review): the title, caption and `fill` label below look copied from a
# different figure (this plot maps `color`, not `fill`) -- confirm wording.
ggplot(df_plot, aes(distance, price, color = cab_type)) +
  transition_time(hour) +
  geom_point(data = df_plot_sample, size = 1, alpha = 0.5) +
  geom_smooth() +
  coord_cartesian(ylim = c(0, 50)) +
  labs(
    title = "Cab out-in in Boston area",
    subtitle = "Hour: {frame_time}",
    caption = "In and out",
    fill = "Out - in"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
